Packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(RSQLite)
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(readr)
library(DBI)
library(devtools)
## Loading required package: usethis
library(inborutils) # for large files
library(rio)
library(knitr)
library(broom)

Reading the Data

# shouldn't need this after writing to SQLite db
# Ideas for larger DB in csv: bigmatrix package
# dat <- readRDS("C:/Users/Chris/OneDrive/R project/Open Policing/yg821jf8611_ca_statewide_2020_04_01.rds")

This is my first time encountering a large dataset (3 million rows). After much stumbling in the dark and many articles I’ve linked the file to a SQLite database, made an object that is a subset of the file (20,000 rows) for initial analysis, then saved it as an RDS object that isn’t so big.

# Ok SQLite does not have a storage class for dates or times, but it seems reasonably fast
library(dplyr)
#The code below should only need to be done once:


#file_name <- "California_Policing"
# sqldb <- dbConnect(SQLite(), dbname = file_name)
# Writing
# dbWriteTable(sqldb, name = "Calfornia_Statewide_Policing", dat, row.names = FALSE, overwrite = TRUE, append = FALSE, field.types = NULL)

# Reading, only need to do once. Taking the first 20,000 rows.
#df <- tbl(sqldb, "Calfornia_Statewide_Policing") %>% 
#  select(-date, -raw_row_number) %>% 
#  filter(row_number() %in% c(1:20000)) %>% 
#  collect()
# saving it as a RDS file for future use
# saveRDS(df,"C:/Users/Chris/OneDrive/R project/Open Policing/Open Policing/df.rds")
# Ok this is our regular working object for now: 20,000 rows out of 3 million to work with 
df <- readRDS("C:/Users/Chris/OneDrive/R project/Open Policing/Open Policing/df.rds")
# dbBegin(db) begins a transaction
# dbRollback(db) roll back reverts to original state
# dbCommit(db) 'commits' the data
# Tidbit for future use
#The code below is an example for if you need to copy data over into a database:
#copy_to(con, nycflights13::flights, "flights",
#  temporary = FALSE, 
#  indexes = list(
#    c("year", "month", "day"), 
#    "carrier", 
#    "tailnum",
#    "dest"
#  )
#)

Exporatory Analysis

library(DataExplorer)

Going to use glimpse on the data to get a look at the data. Glimpse reveals various location, vehicle, warning, and race information. Our variables are mostly categorical, with a lot of NA’s that I might want to replace with 0’s.

glimpse(df)
## Rows: 20,000
## Columns: 19
## $ county_name      <chr> "Stanislaus County", "Stanislaus County", "Stanislaus~
## $ district         <chr> "Modesto", "Modesto", "Modesto", "Modesto", "Modesto"~
## $ subject_race     <chr> "other", "hispanic", "hispanic", "white", "hispanic",~
## $ subject_sex      <chr> "male", "female", "female", "female", "male", "male",~
## $ department_name  <chr> "California Highway Patrol", "California Highway Patr~
## $ type             <chr> "vehicular", "vehicular", "vehicular", "vehicular", "~
## $ violation        <chr> "Motorist / Public Service", "Moving Violation (VC)",~
## $ arrest_made      <int> NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, ~
## $ citation_issued  <int> NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, ~
## $ warning_issued   <int> NA, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, NA, ~
## $ outcome          <chr> NA, "summons", "summons", "summons", "summons", "summ~
## $ contraband_found <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ frisk_performed  <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ search_conducted <int> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ search_person    <int> 0, 0, NA, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ search_basis     <chr> NA, NA, "other", NA, "other", NA, NA, NA, NA, NA, NA,~
## $ reason_for_stop  <chr> "Motorist / Public Service", "Moving Violation (VC)",~
## $ raw_race         <chr> "Other", "Hispanic", "Hispanic", "White", "Hispanic",~
## $ raw_search_basis <chr> "Vehicle Inventory", "Probable Cause (positive)", "Pr~
# This is unused but it's nice to know the way to find date range
#dat %>% 
#  select(date) %>% 
#  summarise(date_range = max(date) - min(date))
plot_str(df)

I’m not sure how nice this graph looks to be honest - will probably delete.

introduce(df)
## # A tibble: 1 x 9
##    rows columns discrete_columns continuous_columns all_missing_columns
##   <int>   <int>            <int>              <int>               <int>
## 1 20000      19               12                  7                   0
## # ... with 4 more variables: total_missing_values <int>, complete_rows <int>,
## #   total_observations <int>, memory_usage <dbl>

Introduce is very useful, it’s telling me that there are a lot of missing values. The memory usage is in byes which is 2.5MB. Perhaps I could select more than 20,000 rows.

plot_missing(df)

Whilst the NA’s in district, county_name and search_person seem to be genuine missing data, the other variables seem to be using NA as a geuine outcome. Let’s take a look at these columns:

df %>% select(outcome, warning_issued, citation_issued, arrest_made, search_basis, contraband_found, frisk_performed) %>% 
  distinct()
## # A tibble: 19 x 7
##    outcome  warning_issued citation_issued arrest_made search_basis  
##    <chr>             <int>           <int>       <int> <chr>         
##  1 <NA>                 NA              NA          NA <NA>          
##  2 summons               0               0           0 <NA>          
##  3 summons               0               0           0 other         
##  4 warning               1               0           0 <NA>          
##  5 citation              0               1           0 <NA>          
##  6 arrest                0               0           1 other         
##  7 <NA>                 NA              NA          NA other         
##  8 arrest                0               0           1 <NA>          
##  9 <NA>                 NA              NA          NA other         
## 10 summons               0               0           0 other         
## 11 summons               0               0           0 probable cause
## 12 citation              0               1           0 other         
## 13 warning               1               0           0 other         
## 14 summons               0               0           0 probable cause
## 15 arrest                0               0           1 consent       
## 16 arrest                0               0           1 probable cause
## 17 warning               1               0           0 probable cause
## 18 <NA>                 NA              NA          NA probable cause
## 19 arrest                0               0           1 probable cause
## # ... with 2 more variables: contraband_found <int>, frisk_performed <int>

The frisk_performed column has only NA’s and 1’s. We can treat the NA’s as 0 i.e no frisk performed. arrest_made, citation_issued, warning_issued, outcome tend to have NA’s together for a row. I believe it is a reasonable assumption that nothing occured during these pull overs. Therefore we can replace these NA’s with 0’s. The search_basis is giving us ‘other’ or NA, so we should probably remove this column. Search_person can also replace NA’s with 0’s.

# Modifying all these NA entries:
df <- df %>% 
  replace_na(list(outcome = "nothing", warning_issued = 0, arrest_made = 0, citation_issued = 0, warning_issued = 0, contraband_found = 0, frisk_performed = 0, search_conducted = 0 )) %>% 
  select(-search_basis)
df <- df %>% replace_na(list(search_person = 0))
plot_missing(df)

The district and county_name entries with NA entries can be treated as unusable data, we can remove them.

df <- df %>% 
  na.omit()
plot_missing(df)

And we’re done cleaning the NA!

Correlation plot

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:rio':
## 
##     export
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose

We want the numeric data only for the correlation matrix, but first we need to turn some categorical variables into numeric:

Indtest <- df %>% 
  group_by(subject_race,outcome) %>% 
  summarise(n = n()) %>% 
  spread(outcome, n)
## `summarise()` has grouped output by 'subject_race'. You can override using the `.groups` argument.
Indtest
## # A tibble: 5 x 6
## # Groups:   subject_race [5]
##   subject_race           arrest citation nothing summons warning
##   <chr>                   <int>    <int>   <int>   <int>   <int>
## 1 asian/pacific islander     12       15     394     676     150
## 2 black                      34       25     481     868     223
## 3 hispanic                  115      213    2084    3232     733
## 4 other                       6       40     341     759     133
## 5 white                     144      231    3125    4542    1362
df %>% 
  group_by(subject_race,outcome) %>%  summarise(n = n()) %>%  mutate(chisq_pval = chisq.test(n)$p.value)
## `summarise()` has grouped output by 'subject_race'. You can override using the `.groups` argument.
## # A tibble: 25 x 4
## # Groups:   subject_race [5]
##    subject_race           outcome      n chisq_pval
##    <chr>                  <chr>    <int>      <dbl>
##  1 asian/pacific islander arrest      12  4.41e-280
##  2 asian/pacific islander citation    15  4.41e-280
##  3 asian/pacific islander nothing    394  4.41e-280
##  4 asian/pacific islander summons    676  4.41e-280
##  5 asian/pacific islander warning    150  4.41e-280
##  6 black                  arrest      34  0        
##  7 black                  citation    25  0        
##  8 black                  nothing    481  0        
##  9 black                  summons    868  0        
## 10 black                  warning    223  0        
## # ... with 15 more rows
df %>% distinct(subject_race)
## # A tibble: 5 x 1
##   subject_race          
##   <chr>                 
## 1 other                 
## 2 hispanic              
## 3 white                 
## 4 black                 
## 5 asian/pacific islander
# In the order of: asian/islander, black, hispanic, other, white
demo <- c(0.1452+0.0036, 0.0551, 0.3929, 0.0368, 0.3664)
# Numbers are taken from https://en.wikipedia.org/wiki/Demographics_of_California#/media/File:Ethic_California_Organized_Pie.png. The other category I obtained from 1 - sum(demo).

Categorical Data Analysis

Visualising the covariation between two categorical variables

ggplot(data = df) +
  geom_count(mapping = aes(x = subject_race, y = outcome, color = ..n.., size = ..n..)) +
  scale_size_area() +
  scale_size_continuous(range = c(1,10)) +
  ggtitle("Covariation Between Outcome and Race") +
  labs(x ="Race of Subject", y = "Outcome") +
  guides(color = "legend")
## Scale for 'size' is already present. Adding another scale for 'size', which
## will replace the existing scale.

While this is not the most informative graph, it is interesting to note that quite few direct arrests. Most of the outcomes are summons or nothing. As one can expect, the circles are largest for the hispanic and white groups - the two groups with the largest samples. Let’s do a proportion graph:

ggplot(data = df) +
  geom_count(mapping = aes(x = subject_race, y = outcome, color = ..prop.., size = ..prop.., group = 1)) +
  scale_size_area() +
  scale_size_continuous(range = c(1,10)) +
  ggtitle("Covariation Between Outcome and Race") +
  labs(x ="Race of Subject", y = "Outcome") +
  guides(color = "legend")
## Scale for 'size' is already present. Adding another scale for 'size', which
## will replace the existing scale.

Show’s the same stuff, but it’s nice to know it’s easy to go between the two. We may also be interested in a heatmap version:

df %>% 
  count(subject_race, outcome) %>% 
ggplot(aes(x = subject_race, y = outcome)) +
  geom_tile(aes(fill = n))

library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.1.2
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(seriation)
## Warning: package 'seriation' was built under R version 4.1.2
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
nrow(unique(df %>% count(subject_race,outcome)))
## [1] 25
# 25
colours <- colorRampPalette(c("blue", "green", "red"))(25)
df %>% 
  count(subject_race, outcome) %>% 
ggplot(aes(x = subject_race, y = outcome)) +
  geom_tile(aes(fill = n)) +
  scale_fill_distiller(palette = "RdPu") 

#  theme_ipsum() moves axis labels to the side
#  Other options
#  scale_fill_gradient(low = "White", high = "blue")
#
 
#  scale_fill_brewer(palette = "PRGn") # scale_fill_brewer requires factor for fill. Ok it's limited to 11 different facotrs this is better for something that is discrete

Same information is displayed but it’s definitely a more visually engaging method. The larger numbers of summons +hispanic/white really pop out. #### Test of single proportion

df %>% 
  group_by(subject_race) %>% 
  summarise(n = n()) %>% 
  mutate(rsum = sum(n))
## # A tibble: 5 x 3
##   subject_race               n  rsum
##   <chr>                  <int> <int>
## 1 asian/pacific islander  1247 19938
## 2 black                   1631 19938
## 3 hispanic                6377 19938
## 4 other                   1279 19938
## 5 white                   9404 19938

We can see that 1631 out of 19938 individuals pulled over were black. Wikipedia states that 5.51% of the population in CA is of black race. We can test the hypothesis that \(H_0:\) The proportion of tested black race being 0.0551 is true \(H_1:\) The proportion of tested black race being 0.0551 is not true

prop.test(1631, 19938, 0.0551, conf.level = 0.95)
## 
##  1-sample proportions test with continuity correction
## 
## data:  1631 out of 19938, null probability 0.0551
## X-squared = 272.56, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.0551
## 95 percent confidence interval:
##  0.07805494 0.08571442
## sample estimates:
##          p 
## 0.08180359

The extremely low p-vale suggests we reject the null hypothesis. The estimated proportion is 0.081 with a 95% confidence interval (0.078, 0.085). This suggests that there is some bias towards selecting black drivers to be pulled over.

# library(broom)
df_chisq <- df %>% 
  group_by(subject_race,outcome) %>%  # the variables you want on the conteingency table
  summarise(n = n()) %>% # need the totals
  mutate(proportion = n/sum(n)) %>%
  select(-proportion) %>% # Oh you definitely need to get rid of proportion here so it spreads properly
  spread(outcome, n) %>%  # contingency table obtained! Also got proportions...and then got rid of them should make them separate
  ungroup() %>% # select will not remove in a grouped tibble
  select(-1) %>% 
  chisq.test # %>% 
## `summarise()` has grouped output by 'subject_race'. You can override using the `.groups` argument.
  glance()
## # A tibble: 0 x 0
df_chisq
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 138.83, df = 16, p-value < 2.2e-16

Testing of association between subject_race and outcome. The p-value is less than 0.05 so we reject the null hypothesis of no association and conclude that there is a association between the row variables (race) and column variables (outcome). Let’s have a look at the expected counts:

round(as_tibble(df_chisq$expected),0)
## # A tibble: 5 x 5
##   arrest citation nothing summons warning
##    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
## 1     19       33     402     630     163
## 2     25       43     526     824     213
## 3     99      168    2055    3223     832
## 4     20       34     412     646     167
## 5    147      247    3030    4753    1227

Compare it to the actual counts in data:

df %>% 
  group_by(subject_race,outcome) %>%  # the variables you want on the conteingency table
  summarise(n = n()) %>% # need the totals
  mutate(proportion = n/sum(n)) %>%
  select(-proportion) %>% # Oh you definitely need to get rid of proportion here so it spreads properly
  spread(outcome, n) %>%  # contingency table obtained! Also got proportions...and then got rid of them should make them separate
  ungroup()
## `summarise()` has grouped output by 'subject_race'. You can override using the `.groups` argument.
## # A tibble: 5 x 6
##   subject_race           arrest citation nothing summons warning
##   <chr>                   <int>    <int>   <int>   <int>   <int>
## 1 asian/pacific islander     12       15     394     676     150
## 2 black                      34       25     481     868     223
## 3 hispanic                  115      213    2084    3232     733
## 4 other                       6       40     341     759     133
## 5 white                     144      231    3125    4542    1362

And the most contributing cells to the total chi-square score:

subject_race <- c("asian/pacific islander", "black", "hispanic", "other", "white")
chisqres <- as_tibble(df_chisq$residuals) %>% add_column(subject_race, .before = 1)
chisqres
## # A tibble: 5 x 6
##   subject_race           arrest citation nothing summons warning
##   <chr>                   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
## 1 asian/pacific islander -1.69     -3.10  -0.391   1.82   -0.994
## 2 black                   1.70     -2.73  -1.94    1.52    0.701
## 3 hispanic                1.56      3.51   0.640   0.158  -3.43 
## 4 other                  -3.12      1.10  -3.50    4.43   -2.62 
## 5 white                  -0.222    -1.03   1.72   -3.06    3.86

The cells with the highest absolute standardized residuals contribute the most to the total chi-square score. Let’s visualise this:

ggplot(data = melt(chisqres), aes(x = subject_race,y = variable )) +
  geom_raster(aes(fill = value)) +
  scale_fill_gradient(low = "green", high = "red")
## Warning in melt(chisqres): The melt generic in data.table has been passed a
## tbl_df and will attempt to redirect to the relevant reshape2 method; please note
## that reshape2 is deprecated, and this redirection is now deprecated as well.
## To continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(chisqres). In
## the next version, this warning will become an error.
## Using subject_race as id variables

It can be seen that the column black is strongly associated with summons/arrest/warning but not strongly associated with nothing and citation.

Classification models

model_df <- df %>% 
  filter(raw_search_basis == "Probable Cause (positive)" | raw_search_basis == "Probable Cause (negative)") %>% 
  mutate(across(where(is_character), as_factor)) %>% 
  select(subject_race, subject_sex, outcome, raw_search_basis, search_conducted) 
set.seed(42) 
rows <- sample(nrow(model_df)) # 11970 rows
# We'll use the first 8000 randomised entries for the model
train <- model_df[1:8000, ]
test <- model_df[8001:11970, ]
# And the rest for testing
model <- glm(raw_search_basis ~ subject_race, family = binomial(link = "logit"), data = train)
summary(model)
## 
## Call:
## glm(formula = raw_search_basis ~ subject_race, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0043  -0.8476  -0.8438   1.4079   1.6338  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -0.52712    0.04230 -12.463  < 2e-16 ***
## subject_racewhite                  -0.32245    0.05521  -5.841 5.20e-09 ***
## subject_raceother                  -0.31169    0.10131  -3.077  0.00209 ** 
## subject_raceblack                   0.10524    0.08752   1.202  0.22922    
## subject_raceasian/pacific islander -0.50201    0.10590  -4.740 2.13e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10115  on 7999  degrees of freedom
## Residual deviance: 10054  on 7995  degrees of freedom
## AIC: 10064
## 
## Number of Fisher Scoring iterations: 4
## T-test: Population Demographic Compared with Police Testing 

#We want to compare the proportion of black policing incidents to the population proportion.

We now have a model depicting the relationship between a search being positive or negative and the race of the person being pulled over. The p-values are interpreted as whether there is a difference between the log-odds of the outcome between the intercept and the explanatory variable. All of the predictors are significant using p < 0.1 as the criteria but if we use p < 0.05 which is more standard then the model would reject subject_raceblack as being useful in the model.

Let’s quickly see how R is dummy coding the variables:

contrasts(model_df$raw_search_basis)
##                           Probable Cause (negative)
## Probable Cause (positive)                         0
## Probable Cause (negative)                         1

Probable cause (positive) is coded as 0 while probable cause (negative) is coded as 1. We take exponential of the estimates for easier interpretation. Some of the notable interptations are:

  • An intercept of -0.50813 implies there is a \(exp(-0.50813) = 0.60\) probability that the pull-over for a hispanic is a false alarm
  • subject_raceblack estimate of 0.13792 means being black increases the log odds by 0.13792. being black has \(exp(0.13792) = 1.148\) odds of being a negative false alarm than hispanics (the reference or intercept category)
  • All other groups have negative estimates and so they are more likely to be Probable Cause(positive) than hispanics
    • White has an estimate of -0.33708, and so \(exp(-0.33708) = 0.714\) implies the average white has a \(0.6*0.714= 0.42\) chance of being a Probable Cause (negative). Which means pullovers have more than a 50% chance of being correct
    • Asian/Pacific Islander has an estimate of -0.49, and so \(exp(-0.49) = 0.61\) implies the average asian/pacific islander has a \(0.6*0.61 = 0.366\) chance of being a Probable Cause (negative). Which means pullovers have a 36% chance of being a Probable Cause (negative). I.e most of the pullovers due to suspicion were justified
# Showing this in quick view
exp(coef(model))
##                        (Intercept)                  subject_racewhite 
##                          0.5903054                          0.7243738 
##                  subject_raceother                  subject_raceblack 
##                          0.7322058                          1.1109739 
## subject_raceasian/pacific islander 
##                          0.6053139
ggplot(model_df, aes(raw_search_basis, fill = subject_race)) +
  geom_bar(position = "fill")

This makes sense when we look at the plot above, proportionally most of the false alarms

Now lets see the classification rate when we test the predictive ability of the model.

test1 <- test %>% select(1,4)
probabilities <-  predict(model, newdata = test1, type = "response")
predicted.class <- ifelse(probabilities > 0.5, "Probable Cause (negative)", "Probable Cause (positive)")
model_dfp <- df %>% 
  filter(raw_search_basis == "Probable Cause (positive)" & subject_race != "other") %>% 
  mutate(across(where(is_character), as_factor)) %>% 
  select(subject_race, subject_sex, outcome, raw_search_basis, search_conducted) 
model_dfn <- df %>% 
  filter(raw_search_basis == "Probable Cause (negative)" & subject_race != "other") %>% 
  mutate(across(where(is_character), as_factor)) %>% 
  select(subject_race, subject_sex, outcome, raw_search_basis, search_conducted) 
set.seed(222)
indexp <- sample(seq_len(nrow(model_dfp)), size = 2000) 
indexn <- sample(seq_len(nrow(model_dfn)), size = 2000) 
train2 <- bind_rows(model_dfp[indexp,],model_dfn[indexn,])
test2 <- bind_rows(model_dfp[-indexp,],model_dfn[-indexn,])
model2 <- glm(raw_search_basis ~ subject_race, data = train2, family = binomial)
prob <- model2 %>% predict(test2, type = "response")
predicted.class <- ifelse(prob > 0.5, "Probable Cause (negative)", "Probable Cause(positive)")
# mean(predicted.class == test2$raw_search_basis)  something is going wrong here...and I can't figure it out my brain hurts
table(predicted.class,test2$raw_search_basis) # But this seems ok
##                            
## predicted.class             Probable Cause (positive) Probable Cause (negative)
##   Probable Cause (negative)                      2069                       817
##   Probable Cause(positive)                       3353                       882

The table shows 3353 correct positive and 817 correct negative predictions. This means the model has \((3353+817)/(3353+817+882+2069) = 58.5%\) chance of predicting correctly.

Poisson Regression

We are interested in whether there is a bias against persons of black race when it comes to vehicle policing in the “California Statewide” category. One of the questions we can ask is: is the proportion of black persons cases different compared to those of other races?

# Poisson regression
# library(MASS)
df %>% 
  group_by(subject_race,outcome) %>%  
  summarise(n = n())
## `summarise()` has grouped output by 'subject_race'. You can override using the `.groups` argument.
## # A tibble: 25 x 3
## # Groups:   subject_race [5]
##    subject_race           outcome      n
##    <chr>                  <chr>    <int>
##  1 asian/pacific islander arrest      12
##  2 asian/pacific islander citation    15
##  3 asian/pacific islander nothing    394
##  4 asian/pacific islander summons    676
##  5 asian/pacific islander warning    150
##  6 black                  arrest      34
##  7 black                  citation    25
##  8 black                  nothing    481
##  9 black                  summons    868
## 10 black                  warning    223
## # ... with 15 more rows
p<- df %>% 
  group_by(subject_race,outcome) %>%  
  summarise(n = n()) 
## `summarise()` has grouped output by 'subject_race'. You can override using the `.groups` argument.
glm1 <- glm(n ~ subject_race + outcome, data = p, family = "poisson")
summary(glm1)
## 
## Call:
## glm(formula = n ~ subject_race + outcome, family = "poisson", 
##     data = p)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6719  -2.7177  -0.2225   1.5190   4.3076  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           2.96791    0.06299  47.120  < 2e-16 ***
## subject_raceblack     0.26845    0.03762   7.136 9.58e-13 ***
## subject_racehispanic  1.63196    0.03096  52.706  < 2e-16 ***
## subject_raceother     0.02534    0.03980   0.637    0.524    
## subject_racewhite     2.02039    0.03014  67.039  < 2e-16 ***
## outcomecitation       0.52170    0.07158   7.288 3.14e-13 ***
## outcomenothing        3.02816    0.05806  52.155  < 2e-16 ***
## outcomesummons        3.47822    0.05757  60.414  < 2e-16 ***
## outcomewarning        2.12386    0.06000  35.398  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 32423.14  on 24  degrees of freedom
## Residual deviance:   145.72  on 16  degrees of freedom
## AIC: 346.9
## 
## Number of Fisher Scoring iterations: 4
# checking for overdispersion
# library(AER)
AER::dispersiontest(glm1, trafo = 1)
## 
##  Overdispersion test
## 
## data:  glm1
## z = 4.1806, p-value = 1.454e-05
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
##    alpha 
## 4.608596
# Based on: https://stats.stackexchange.com/questions/66586/is-there-a-test-to-determine-whether-glm-overdispersion-is-significant

We need to meet Poisson regression assumptions. Variable is a count: yes. The events must be independent - we can assume one person getting stopped does not make another person getting stopped more likely.

After obtaining the model (glm1), the summary shows that a lot of variables are significant (excluding raceother which is acceptable). We then decide to calculate the p-value for the deviance goodness of fit test:

pchisq(glm1$deviance, df = glm1$df.residual, lower.tail = FALSE)
## [1] 5.432394e-23

The null hypothesis is that glm1 is correctly specified, and this tiny p-value suggests that we reject the hypothesis. So this model fits badly.

Data_cor <- df[ ,map_lgl(df, is.numeric)]

# Data_cor is the dataset (not correlation matrix) you want to use
corrdata <- cor(na.omit(select_if(Data_cor, is.numeric)))
corrdata[upper.tri(corrdata, diag = TRUE)] <- NA
corrdata <- corrdata[-1, -ncol(corrdata)] # take out the first row (no 1-1 correlations)

# Storing variable names for later use
x_labels <- colnames(corrdata)
y_labels <- rownames(corrdata)

# Change variable names to numeric for the grid
colnames(corrdata) <- 1:ncol(corrdata)
rownames(corrdata) <- nrow(corrdata):1

# Melt the data into  the desired format
plotdata <- melt(corrdata)
## Warning in melt(corrdata): The melt generic in data.table has been passed a
## matrix and will attempt to redirect to the relevant reshape2 method; please note
## that reshape2 is deprecated, and this redirection is now deprecated as well.
## To continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(corrdata). In
## the next version, this warning will become an error.
# Adding size variable and scaling it. $value is the correlation value
plotdata$size <- (abs(plotdata$value))
scaling <- 500/ncol(corrdata)/2
plotdata$size <- plotdata$size*scaling 

# Setting x and y ranges for the chart
# We used unit values for initial grid, so shift by 0.5 to create gridlines
xrange <- c(0.5, length(x_labels) + 0.5)
yrange <- c(0.5, length(y_labels) + 0.5)

# Setting the gridlines
x_grid <- seq(1.5, length(x_labels) - 0.5, 1)
y_grid <- seq(1.5, length(x_labels) - 0.5, 1)

# Now some cleanup. Naming variables and removing gridlines

xAx1 <- list(showgrid = FALSE, showline = FALSE, zeroline = FALSE, tickvals = colnames(corrdata), ticktext = x_labels, title = FALSE)

xAx2 <- list(showgrid = FALSE, showline = FALSE, zeroline = FALSE, overlaying = "x", showticklabels = FALSE, range = xrange, tickvals = x_grid)

yAx1 <- list(autoaxis = FALSE, showgrid  = FALSE, showline = FALSE, zeroline = FALSE, tickvals = rownames(corrdata), ticktext = y_labels, title = FALSE)

yAx2 <- list(showgrid = TRUE, showline = FALSE, zeroline = FALSE, overlaying = "y", showticklabels = FALSE, range = yrange, tickvals = y_grid)

fig <- plot_ly(data = plotdata, width = 500, height = 500)

fig <- fig %>% add_trace(x = ~Var2, y = ~Var1, type = "scatter", mode = "markers",
                        color = ~value,
                        marker = list(size = ~size, opacity = 1),
                        symbol = I("square"),
                        text = ~value,
                        hovertemplate = "%{text:.2f} <extra></extra>",
                        xaxis = "x1",
                        yaxis = "y1")

fig <- fig %>% add_trace(x = ~Var2, y = ~Var1, type = "scatter", mode = "markers",
                        opacity = 0,
                        showlegend = FALSE,
                        xaxis = "x2",
                        yaxis = "y2",
                        hoverinfo = "none")

fig <- fig %>% layout(xaxis = xAx1,
                     yaxis = yAx1, 
                     xaxis2 = xAx2,
                     yaxis2 = yAx2,
                     plot_bgcolor = "rgba(0,0,0,0)",
                     paper_bgcolor = "rgba(0, 0, 0, 0.03)")

fig <- fig %>% colorbar(title = "", limits = c(-1,1), x = 1.1, y = 0.75)
fig